The Stack Overflow Developer Survey 2020 was applied from February 5th to the 28th. There were a total of 64461 respondents for a total of 61 questions. However, less than 30 questions are considered for this analysis.
The main objective of this analysis is to explore who are the respondents, such as how they use code professionally, their country, academic background, technologies they work with, etc.
The original dataset can be downloaded from Stack Overflow’s website here.
require(RPostgreSQL)
## Loading required package: RPostgreSQL
## Loading required package: DBI
require(ini)
## Loading required package: ini
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load host, database, user and password information stored in the "postgresql" group
db_info <- read.ini("db.ini")$postgresql
db_driver <- dbDriver("PostgreSQL")
db_connection <- dbConnect(
db_driver,
dbname = db_info$database,
host = db_info$host,
user = db_info$user,
password = db_info$password
)
query_str <- "
SELECT gender, COUNT(gender) AS gender_count
FROM respondents
WHERE gender IS NOT NULL
GROUP BY gender;
"
df <- dbGetQuery(db_connection, query_str)
fig <- plot_ly() %>%
add_pie(
data = df,
labels = df$gender,
values = df$gender_count
) %>%
layout(title = "Gender of Respondents")
fig
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
query_str <- "
SELECT age, COUNT(age) AS age_count
FROM respondents
WHERE age BETWEEN 10 AND 80
GROUP BY age;
"
df <- dbGetQuery(db_connection, query_str)
# Relative frequencies for the key takeaways (single-choice question)
age_20_30 <- sum(df[df$age >= 20 & df$age <=30,"age_count"] / sum(df$age_count))
fig <- plot_ly() %>%
add_bars(
data = df,
x = df$age,
y = df$age_count
) %>%
layout(title = "Age of Respondents")
fig
query_str <- "
SELECT country, COUNT(country) AS country_count
FROM respondents
GROUP BY country
ORDER BY country_count DESC
LIMIT 15;
"
df <- dbGetQuery(db_connection, query_str)
df <- df[order(df$country_count),]
fig <- plot_ly() %>%
add_bars(
data = df,
x = df$country_count,
y = df$country
) %>%
layout(
title = "Country of Respondents",
yaxis = list(
categoryorder = "array",
categoryarray = df$country_count
)
)
fig
query_str <- "
SELECT ed_level, COUNT(ed_level) AS ed_level_count
FROM respondents
WHERE ed_level IS NOT NULL
GROUP BY ed_level
ORDER BY ed_level_count DESC;
"
df <- dbGetQuery(db_connection, query_str)
df <- df[order(df$ed_level_count),]
# Relative frequencies for the key takeaways (single-choice question)
ed_level_bachelor <- df[df$ed_level == "Bachelor's degree","ed_level_count"] / sum(df$ed_level_count)
ed_level_masters <- df[df$ed_level == "Master's degree","ed_level_count"] / sum(df$ed_level_count)
fig <- plot_ly() %>%
add_bars(
data = df,
x = df$ed_level_count,
y = df$ed_level
) %>%
layout(
title = "Education Level of Respondents",
yaxis = list(
categoryorder = "array",
categoryarray = df$ed_level_count
)
)
fig
query_str <- "
SELECT hobbyist, COUNT(hobbyist) AS hobbyist_count
FROM respondents
WHERE hobbyist IS NOT NULL
GROUP BY hobbyist;
"
df <- dbGetQuery(db_connection, query_str)
fig <- plot_ly() %>%
add_pie(
data = df,
labels = df$hobbyist,
values = df$hobbyist_count
) %>%
layout(title = "Do you code as a hobby?")
fig
query_str <- "
SELECT employment_status, COUNT(employment_status) AS employment_status_count
FROM respondents
WHERE employment_status IS NOT NULL
GROUP BY employment_status
ORDER BY employment_status_count DESC;
"
df <- dbGetQuery(db_connection, query_str)
df <- df[order(df$employment_status_count),]
fig <- plot_ly() %>%
add_bars(
data = df,
x = df$employment_status_count,
y = df$employment_status
) %>%
layout(
title = "Employment Status of Respondents",
yaxis = list(
categoryorder = "array",
categoryarray = df$employment_status_count
)
)
fig
query_str <- "
SELECT COUNT(DISTINCT id) AS num_respondents
FROM respondents;
"
num_respondents <- dbGetQuery(db_connection, query_str)[1, "num_respondents"]
query_str <- "
SELECT dev_role, COUNT(dev_role) AS dev_role_count
FROM dev_roles
WHERE dev_role IS NOT NULL
GROUP BY dev_role
ORDER BY dev_role_count DESC
LIMIT 10;
"
df <- dbGetQuery(db_connection, query_str)
df <- df[order(df$dev_role_count),]
# Relative frequencies for the key takeaways
dev_role_backend <- df[df$dev_role == "Back-end developer","dev_role_count"] / num_respondents
dev_role_fullstack <- df[df$dev_role == "Full-stack developer","dev_role_count"] / num_respondents
dev_role_frontend <- df[df$dev_role == "Front-end developer","dev_role_count"] / num_respondents
fig <- plot_ly() %>%
add_bars(
data = df,
x = df$dev_role_count,
y = df$dev_role
) %>%
layout(
title = "10 Most Common Developer Roles",
yaxis = list(
categoryorder = "array",
categoryarray = df$dev_role_count
)
)
fig
query_str <- "
SELECT yearly_compensation
FROM respondents
WHERE yearly_compensation BETWEEN 0 AND 200000;
"
df <- dbGetQuery(db_connection, query_str)
# round(df$yearly_compensation / length(df$yearly_compensation), 2)
bins <- list(start = 0, end = 210000, size = 15000)
fig <- plot_ly() %>%
add_histogram(
data = df,
x = df$yearly_compensation,
xbins = bins
) %>%
layout(
title = "Annual Compensation"
)
fig
query_str <- "
SELECT
prog_languages.prog_language AS prog_language,
AVG(respondents.yearly_compensation) AS compensation_avg
FROM prog_languages
INNER JOIN respondents
ON prog_languages.respondent_id = respondents.id
WHERE yearly_compensation BETWEEN 0 AND 200000
GROUP BY prog_language
ORDER BY compensation_avg DESC
LIMIT 20;
"
df <- dbGetQuery(db_connection, query_str)
df <- df[order(df$compensation_avg),]
fig <- plot_ly() %>%
add_bars(
data = df,
x = df$compensation_avg,
y = df$prog_language
) %>%
layout(
title = "Average Annual Compensation by Programming Language (Top 20)",
yaxis = list(
categoryorder = "array",
categoryarray = df$compensation_avg
)
)
fig
query_str <- "
SELECT
dev_roles.dev_role AS dev_role,
AVG(respondents.yearly_compensation) AS compensation_avg
FROM dev_roles
INNER JOIN respondents
ON dev_roles.respondent_id = respondents.id
WHERE
yearly_compensation BETWEEN 0 AND 200000
AND dev_role IS NOT NULL
GROUP BY dev_role
ORDER BY compensation_avg DESC;
"
df <- dbGetQuery(db_connection, query_str)
df <- df[order(df$compensation_avg),]
fig <- plot_ly() %>%
add_bars(
data = df,
x = df$compensation_avg,
y = df$dev_role
) %>%
layout(
title = "Average Annual Compensation by Developer Role (Top 20)",
yaxis = list(
categoryorder = "array",
categoryarray = df$compensation_avg
)
)
fig
query_str <- "
SELECT work_week_hrs
FROM respondents
WHERE work_week_hrs BETWEEN 0 AND 100;
"
df <- dbGetQuery(db_connection, query_str)
# round(df$yearly_compensation / length(df$yearly_compensation), 2)
bins <- list(start = 0, end = 100, size = 10)
fig <- plot_ly() %>%
add_histogram(
data = df,
x = df$work_week_hrs,
xbins = bins
) %>%
layout(
title = "Hours of Work per Week"
)
fig
query_str <- "
SELECT overtime, COUNT(overtime) AS overtime_count
FROM respondents
WHERE overtime IS NOT NULL
GROUP BY overtime;
"
df <- dbGetQuery(db_connection, query_str)
fig <- plot_ly() %>%
add_pie(
data = df,
labels = df$overtime,
values = df$overtime_count
) %>%
layout(title = "Overtime Work Frequency")
fig
query_str <- "
SELECT undegrad_major, COUNT(undegrad_major) AS undegrad_major_count
FROM respondents
WHERE undegrad_major IS NOT NULL
GROUP BY undegrad_major
ORDER BY undegrad_major_count DESC;
"
df <- dbGetQuery(db_connection, query_str)
df <- df[order(df$undegrad_major_count),]
fig <- plot_ly() %>%
add_bars(
data = df,
x = df$undegrad_major_count,
y = df$undegrad_major
) %>%
layout(
title = "Undergrad Major of Respondents",
yaxis = list(
categoryorder = "array",
categoryarray = df$undegrad_major_count
)
)
fig
query_str <- "
SELECT stuck_solution, COUNT(stuck_solution) AS stuck_solution_count
FROM stuck_solutions
WHERE stuck_solution IS NOT NULL
GROUP BY stuck_solution
ORDER BY stuck_solution_count DESC;
"
df <- dbGetQuery(db_connection, query_str)
fig <- plot_ly() %>%
add_bars(
data = df,
x = df$stuck_solution,
y = df$stuck_solution_count
) %>%
layout(
title = "Solutions Used When Stuck",
xaxis = list(
categoryorder = "array",
categoryarray = df$stuck_solution
)
)
fig
query_str <- "
SELECT COUNT(DISTINCT id) AS num_respondents
FROM respondents;
"
num_respondents <- dbGetQuery(db_connection, query_str)[1, "num_respondents"]
query_str <- "
SELECT job_factor, COUNT(job_factor) AS job_factor_count
FROM job_factors
GROUP BY job_factor
ORDER BY job_factor_count DESC
LIMIT 10;
"
df <- dbGetQuery(db_connection, query_str)
# Relative frequencies for the key takeaways
job_factor_tech <- df[df$job_factor == "Technical compatibility","job_factor_count"] / num_respondents
job_factor_culture <- df[df$job_factor == "Company culture","job_factor_count"] / num_respondents
job_factor_schedule <- df[df$job_factor == "Flexible schedule","job_factor_count"] / num_respondents
fig <- plot_ly() %>%
add_bars(
data = df,
x = df$job_factor,
y = df$job_factor_count
) %>%
layout(
title = "10 Most Important Job Factors",
xaxis = list(
categoryorder = "array",
categoryarray = df$job_factor_count
)
)
fig
dbDisconnect(db_connection)
## [1] TRUE